{
 "metadata": {
  "name": "",
  "signature": "sha256:e652be74844b894d16a9d4190b78c306f63deee717eaa25c02c268ad7c24623c"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "data_file = \"/nfs/data/KDD99/kdd_cup.data\"\n",
      "raw_data = sc.textFile(data_file)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# count by all different labels and print them decreasingly\n",
      "print \"Counting all different labels\"\n",
      "labels = raw_data.map(lambda line: line.strip().split(\",\")[-1])\n",
      "label_counts = labels.countByValue()\n",
      "sorted_labels = OrderedDict(sorted(label_counts.items(), key=lambda t: t[1], reverse=True))\n",
      "for label, count in sorted_labels.items():\n",
      "    print label, count"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}